Setup

# Start from an empty workspace.
# NOTE(review): rm(list = ls()) removes every (non-hidden) object in the
# calling environment, which is destructive when this is run interactively;
# consider relying on a fresh R session instead.
rm(list=ls())
# Show the source code of each chunk in the rendered document
knitr::opts_chunk$set(echo = TRUE)
# Attach the data-manipulation, plotting and PCA-visualisation packages
# without printing their startup banners
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggbiplot))
suppressPackageStartupMessages(library(factoextra))
# Pull in project-wide definitions.
# NOTE(review): absolute, user-specific path; presumably this file defines
# load_repo_features() and the saved_repo_features_* objects used below --
# confirm.
source("~/Dropbox/Documents/Github_mining/src/R/project_info.R")
# Load the per-repository feature tables for the two datasets
repo_data_main <- load_repo_features(saved_repo_features_main)
repo_data_high_prof <- load_repo_features(saved_repo_features_high_prof)

Interpretation

Intuition for eigenvalues and eigenvectors

From here and here:

The proportion of the variation in the data variables that is explained by a PC is equal to that component’s associated eigenvalue divided by the sum of all eigenvalues.

An eigenvalue > 1 indicates that the PC accounts for more variance than is accounted for by any single original variable in standardized data. This is commonly used as a cutoff point for deciding which PCs are retained.

You can also limit the number of components to the number that accounts for a certain fraction of the total variance. For example, if you are satisfied with 80% of the total variance explained, then use the number of components needed to achieve that.

Note that a good dimension reduction is achieved when the first few PCs account for a large proportion of the variability (80-90%).

The column labeled PC1 is the eigenvector of the data covariance matrix associated with the largest eigenvalue. Its elements are the coefficients or loadings of each original variable on the first PC. It matters if the loadings have opposite signs, but not which is positive and which is negative. The magnitudes of the loadings are also important.

From here:

PC scores: Also called component scores in PCA, these scores are the scores of each case (row) on each factor (column). To compute the factor score for a given case for a given factor, one takes the case’s standardized score on each variable, multiplies by the corresponding factor loading of the variable for the given factor, and sums these products.

Preprocess the data

# Combine repo data from both datasets, tagging every row with its origin.
# The "topic" columns are removed from the main dataset before stacking.
repo_data_all <- rbind(
  repo_data_high_prof %>% mutate(is_high_profile = TRUE),
  repo_data_main %>%
    select(-contains("topic")) %>%
    mutate(is_high_profile = FALSE)
)
# Get numeric columns. vapply() always returns a logical vector and
# drop = FALSE keeps a data frame even when a single column is selected.
is_num_col <- vapply(repo_data_all, is.numeric, logical(1))
repo_data_numeric <- repo_data_all[, is_num_col, drop = FALSE]
# Replace NA's by the column median, one whole column at a time.
# Columns are accessed with [[ ]] so this works for data frames and tibbles
# alike (on a tibble, [, j] returns a one-column tibble, for which
# is.numeric() is FALSE, so a [, j]-based check would skip the imputation).
# A column that is entirely NA has an NA median and is left unchanged.
for (j in seq_len(ncol(repo_data_numeric))) {
  col_j <- repo_data_numeric[[j]]
  col_j[is.na(col_j)] <- median(col_j, na.rm = TRUE)
  repo_data_numeric[[j]] <- col_j
}
# Apply transformations: center each column to mean 0 and scale to unit
# standard deviation
repo_data_numeric <- data.frame(scale(repo_data_numeric, center = TRUE, scale = TRUE))

Run PCA

# Fit the PCA. cor = TRUE bases the decomposition on the correlation matrix
# rather than the covariance matrix; scores = TRUE also computes the score of
# every observation on each component.
pca_res <- princomp(repo_data_numeric, cor = TRUE, scores = TRUE)

How many PCs explain most of the variance?

# Share of the total variance carried by each PC: its eigenvalue (sdev^2)
# divided by the sum of all eigenvalues
prop_var <- prop.table(pca_res$sdev^2)
# Running total: variance explained by the first k PCs together
cumulative_prop_var <- cumsum(prop_var)
# Smallest number of leading PCs whose cumulative variance exceeds a threshold.
#
# Args:
#   cumulative_prop_var: numeric vector of cumulative proportions of variance
#     explained, in PC order (e.g. the result of cumsum()).
#   p: threshold; the comparison is strict (cumulative proportion > p).
#
# Returns:
#   The (unnamed) integer index of the first entry strictly greater than p.
#   If no entry exceeds p (e.g. p >= 1, or an empty input) all components
#   are needed, so length(cumulative_prop_var) is returned rather than a
#   silent NULL.
ncomp_min_prop_var <- function(cumulative_prop_var, p) {
  # match() gives the position of the first TRUE, or NA when there is none
  first_above <- match(TRUE, cumulative_prop_var > p)
  if (is.na(first_above)) {
    return(length(cumulative_prop_var))
  }
  first_above
}
# Keep the smallest number of leading PCs whose cumulative proportion of
# variance explained is above the 0.8 threshold
ncomp_keep <- ncomp_min_prop_var(cumulative_prop_var, 0.8)

# Number of components to keep (auto-printed)
ncomp_keep
## [1] 43

Make scree plots

# Scree plot of the variance explained by each of the retained PCs
fviz_eig(pca_res, choice = "variance", ncp = ncomp_keep)

# Scree plot of the eigenvalues themselves; the red horizontal line marks
# eigenvalue = 1
fviz_eig(pca_res, choice = "eigenvalue", ncp = ncomp_keep) +
  geom_hline(yintercept = 1, colour = "red")

Look at coefficients

# Display the top n variables for each PC, ranked by absolute loading
n_top <- 20L
# Extract the loadings matrix once rather than on every iteration
pca_loadings <- loadings(pca_res)
for (j in seq_len(ncomp_keep)) {
  message(paste("PC", j))
  coefs <- pca_loadings[, j]
  # Largest-magnitude coefficients first; the printed value keeps its sign
  ord <- order(abs(coefs), decreasing = TRUE)
  # head() stops at the number of available variables, so fewer than n_top
  # variables does not produce NA rows (as indexing with 1:20 would)
  pr <- coefs[head(ord, n_top)]
  print(data.frame(coef = pr))
}
## PC 1
##                                                    coef
## total_bytes_no_data_procedural               -0.1276308
## total_lines_code_and_comment_no_data         -0.1227921
## total_lines_comment_no_data                  -0.1216897
## total_bytes_no_data_imperative               -0.1216747
## total_lines_comment                          -0.1212459
## total_lines_of_code_no_data                  -0.1211968
## total_lines_code_and_comment                 -0.1182910
## total_file_size_no_data                      -0.1181329
## total_bytes_no_data_compiled                 -0.1177452
## total_bytes_no_data_object_oriented          -0.1147552
## total_lines_of_code                          -0.1142718
## total_files_procedural                       -0.1137651
## total_bytes_no_data_functional_impure        -0.1127555
## total_bytes_no_data_compatibility_nominative -0.1124212
## total_bytes_no_data_type_system_static       -0.1105369
## total_files_compiled                         -0.1102068
## total_files_imperative                       -0.1094048
## num_langs_test_cases_no_data                 -0.1090291
## num_langs                                    -0.1083831
## total_files_object_oriented                  -0.1080914
## PC 2
##                                               coef
## total_bytes_no_data_type_system_safe    -0.1288695
## total_bytes_no_data_type_system_dynamic -0.1261959
## total_bytes_no_data_interpreted         -0.1244512
## total_bytes_no_data_compatibility_duck  -0.1236843
## bytes_JavaScript                        -0.1220937
## total_files_type_system_safe            -0.1218642
## total_lines_code_and_comment_JavaScript -0.1196745
## total_lines_of_code_JavaScript          -0.1184044
## total_files_type_system_dynamic         -0.1142995
## mean_bytes_m4                            0.1135142
## max_lines_code_and_comment_m4            0.1103312
## max_lines_code_m4                        0.1102678
## total_lines_comment_JavaScript          -0.1098781
## mean_lines_code_and_comment_m4           0.1081187
## mean_lines_code_m4                       0.1078653
## total_files_interpreted                 -0.1068622
## total_files_compatibility_duck          -0.1066592
## max_lines_code_and_comment_Bourne_Shell  0.1037132
## max_lines_code_Bourne_Shell              0.1035287
## pct_files_no_data_object_oriented       -0.1027611
## PC 3
##                                                  coef
## pct_files_no_data_compatibility_duck        0.1560800
## pct_bytes_no_data_compatibility_duck        0.1532218
## pct_files_no_data_type_system_dynamic       0.1521585
## pct_files_no_data_interpreted               0.1511942
## pct_files_no_data_type_system_static       -0.1500162
## pct_bytes_no_data_type_system_dynamic       0.1479193
## pct_bytes_no_data_interpreted               0.1470214
## pct_bytes_no_data_type_system_static       -0.1462854
## pct_files_no_data_compatibility_nominative -0.1420657
## pct_bytes_no_data_compatibility_nominative -0.1411327
## pct_bytes_no_data_Java                     -0.1358078
## mean_bytes_per_line_code_and_comment_Java  -0.1203335
## pct_bytes_no_data_R                         0.1145398
## pct_lines_comment_Java                     -0.1141904
## total_lines_code_and_comment_Java          -0.1115311
## max_lines_code_Java                        -0.1108032
## total_lines_of_code_Java                   -0.1098941
## max_lines_code_and_comment_Java            -0.1092429
## bytes_Java                                 -0.1088435
## total_lines_comment_Java                   -0.1083414
## PC 4
##                                                   coef
## pct_files_no_data_compiled                  -0.1825425
## pct_bytes_no_data_compiled                  -0.1691235
## pct_files_no_data_type_system_safe          -0.1584043
## pct_bytes_no_data_type_system_safe          -0.1528362
## pct_files_no_data_imperative                -0.1503853
## pct_bytes_no_data_imperative                -0.1409746
## pct_bytes_no_data_Python                    -0.1254706
## pct_files_no_data_array                      0.1209297
## pct_bytes_no_data_array                      0.1198673
## total_lines_comment_C_Cpp_Header             0.1177038
## total_lines_code_and_comment_C_Cpp_Header    0.1127964
## num_files_no_data                            0.1107477
## num_files_C_Cpp_Header                       0.1097586
## total_lines_of_code_C_Cpp_Header             0.1085011
## mean_bytes_per_line_code_and_comment_Python -0.1068548
## pct_files_no_data_type_system_unsafe         0.1037420
## pct_bytes_no_data_Java                      -0.1034078
## num_files                                    0.1015726
## pct_bytes_no_data_R                          0.1012059
## bytes_C_Cpp_Header                           0.1011909
## PC 5
##                                              coef
## pct_bytes_no_data_R                    -0.1448219
## pct_files_no_data_functional_impure    -0.1344128
## pct_bytes_no_data_functional_impure    -0.1314789
## pct_bytes_no_data_array                -0.1302764
## pct_files_no_data_array                -0.1263007
## total_files_type_system_static         -0.1221173
## pct_files_no_data_procedural           -0.1148500
## total_bytes_no_data_type_system_static -0.1132654
## total_files_compiled                   -0.1113646
## pct_files_no_data_object_oriented      -0.1067859
## total_files_compatibility_nominative   -0.1064765
## pct_bytes_no_data_object_oriented      -0.1057431
## num_files_Cpp                          -0.1044121
## pct_files_no_data_type_system_unsafe   -0.1039217
## pct_lines_comment_R                    -0.1032556
## total_files_type_system_unsafe         -0.1030084
## pct_files_no_data_compatibility_duck   -0.1029847
## pct_bytes_no_data_procedural           -0.1021319
## pct_bytes_no_data_compatibility_duck   -0.1020712
## total_lines_comment_Cpp                -0.1013010
## PC 6
##                                              coef
## bytes_SQL                              -0.1603115
## total_bytes_no_data_declarative        -0.1602020
## total_lines_code_and_comment_SQL       -0.1597201
## total_lines_of_code_SQL                -0.1596877
## total_lines_comment_SQL                -0.1592722
## num_files_SQL                          -0.1578101
## max_lines_code_SQL                     -0.1577875
## total_files_declarative                -0.1575398
## max_lines_code_and_comment_SQL         -0.1574855
## pct_bytes_test_cases_procedural         0.1379559
## pct_bytes_test_cases_object_oriented    0.1374439
## pct_lines_in_test_cases_no_data         0.1349042
## pct_bytes_in_test_cases_no_data         0.1321775
## pct_bytes_test_cases_imperative         0.1233329
## pct_bytes_test_cases_compiled           0.1145024
## pct_bytes_test_cases_functional_impure  0.1139793
## pct_bytes_test_cases_interpreted        0.1069488
## pct_bytes_test_cases_type_system_safe   0.1058571
## total_bytes_test_cases_object_oriented  0.1045796
## pct_bytes_no_data_type_system_dynamic  -0.0999194
## PC 7
##                                            coef
## total_bytes_no_data_declarative      -0.1687734
## bytes_SQL                            -0.1681989
## total_lines_comment_SQL              -0.1681696
## total_lines_code_and_comment_SQL     -0.1678177
## total_lines_of_code_SQL              -0.1676471
## max_lines_code_and_comment_SQL       -0.1659676
## max_lines_code_SQL                   -0.1656246
## total_files_declarative              -0.1628827
## num_files_SQL                        -0.1618883
## bytes_MATLAB                         -0.1314321
## total_lines_of_code_MATLAB           -0.1312552
## total_lines_code_and_comment_MATLAB  -0.1310628
## pct_bytes_test_cases_functional_pure -0.1308329
## total_lines_comment_MATLAB           -0.1273178
## total_bytes_no_data_array            -0.1269834
## total_files_array                    -0.1262827
## num_files_MATLAB                     -0.1200883
## max_lines_code_and_comment_MATLAB    -0.1168897
## commit_authors_no_gender             -0.1161183
## num_days_new_files_added             -0.1150610
## PC 8
##                                                        coef
## total_lines_code_and_comment_Bourne_Again_Shell -0.17578749
## max_lines_code_and_comment_Bourne_Again_Shell   -0.17510805
## total_lines_of_code_Bourne_Again_Shell          -0.17492393
## max_lines_code_Bourne_Again_Shell               -0.17403430
## total_lines_comment_Bourne_Again_Shell          -0.17192075
## bytes_Bourne_Again_Shell                        -0.16508086
## mean_lines_code_and_comment_Bourne_Again_Shell  -0.16474090
## mean_lines_code_Bourne_Again_Shell              -0.16118644
## mean_bytes_Bourne_Again_Shell                   -0.13569158
## total_lines_comment_C                           -0.11725312
## max_lines_code_and_comment_Python                0.11223789
## mean_bytes_per_line_code_and_comment_Python      0.10966338
## max_lines_code_Python                            0.10606161
## max_lines_code_C_Cpp_Header                      0.09994796
## pct_bytes_no_data_Python                         0.09940012
## mean_bytes_Python                                0.09938811
## mean_lines_code_and_comment_Python               0.09891870
## max_lines_code_and_comment_C_Cpp_Header          0.09827440
## total_bytes_test_cases_type_system_static       -0.09773522
## max_lines_code_and_comment_Cpp                   0.09704321
## PC 9
##                                                   coef
## total_lines_of_code_MATLAB                  -0.2046767
## total_lines_code_and_comment_MATLAB         -0.2039742
## bytes_MATLAB                                -0.2035720
## total_lines_comment_MATLAB                  -0.1973735
## num_files_MATLAB                            -0.1846725
## max_lines_code_and_comment_MATLAB           -0.1832486
## max_lines_code_MATLAB                       -0.1751083
## total_files_array                           -0.1697399
## total_bytes_no_data_array                   -0.1601974
## pct_bytes_no_data_MATLAB                    -0.1476883
## mean_lines_code_and_comment_MATLAB          -0.1431981
## mean_bytes_per_line_code_and_comment_MATLAB -0.1431630
## pct_files_no_data_imperative                -0.1405149
## mean_lines_code_MATLAB                      -0.1392645
## mean_bytes_MATLAB                           -0.1347921
## pct_lines_comment_MATLAB                    -0.1337496
## pct_bytes_no_data_Python                    -0.1304746
## pct_bytes_no_data_imperative                -0.1288551
## mean_lines_code_and_comment_Python          -0.1244374
## pct_files_no_data_type_system_unsafe         0.1226126
## PC 10
##                                                  coef
## bytes_PHP                                   0.1595617
## total_lines_of_code_PHP                     0.1591604
## total_lines_code_and_comment_PHP            0.1587388
## total_lines_comment_PHP                     0.1564654
## num_files_PHP                               0.1562980
## max_lines_code_PHP                          0.1309357
## max_lines_code_and_comment_PHP              0.1246946
## pct_bytes_no_data_PHP                       0.1200420
## total_files_declarative                     0.1166108
## num_files_SQL                               0.1165952
## max_lines_code_and_comment_Perl            -0.1153217
## max_lines_code_Perl                        -0.1152090
## total_lines_of_code_SQL                     0.1143251
## total_lines_code_and_comment_SQL            0.1142399
## bytes_SQL                                   0.1136413
## total_bytes_no_data_declarative             0.1132854
## total_lines_comment_SQL                     0.1132673
## pct_bytes_no_data_compatibility_nominative -0.1128934
## max_lines_code_SQL                          0.1101682
## max_lines_code_and_comment_SQL              0.1096996
## PC 11
##                                                  coef
## max_lines_code_Ruby                        0.28794091
## max_lines_code_and_comment_Ruby            0.28773649
## bytes_Ruby                                 0.28748905
## total_lines_code_and_comment_Ruby          0.28428503
## total_lines_of_code_Ruby                   0.28222816
## num_files_Ruby                             0.27883798
## total_lines_comment_Ruby                   0.27300780
## pct_bytes_no_data_Ruby                     0.19526333
## mean_bytes_per_line_code_and_comment_Ruby  0.13927621
## total_files_compatibility_duck             0.12597287
## total_files_type_system_dynamic            0.09540202
## mean_lines_code_and_comment_Ruby           0.09110303
## mean_lines_code_Ruby                       0.09042934
## pct_files_no_data_procedural              -0.08896505
## pct_bytes_no_data_procedural              -0.08887075
## mean_bytes_Ruby                            0.08531728
## total_files_interpreted                    0.08254316
## max_lines_code_and_comment_no_data        -0.08116563
## bytes_PHP                                 -0.07939042
## max_lines_code_no_data                    -0.07701542
## PC 12
##                                                 coef
## total_lines_of_code_Perl                   0.2094945
## total_lines_code_and_comment_Perl          0.2084690
## max_lines_code_Perl                        0.2041377
## num_files_Perl                             0.2024330
## max_lines_code_and_comment_Perl            0.2023879
## pct_bytes_no_data_Perl                     0.2007708
## total_lines_comment_Perl                   0.1994074
## bytes_Perl                                 0.1944453
## mean_bytes_per_line_code_and_comment_Perl  0.1587875
## mean_lines_code_and_comment_Perl           0.1577549
## mean_bytes_Perl                            0.1541149
## mean_lines_code_Perl                       0.1526362
## mean_lines_code_and_comment_no_data       -0.1481061
## mean_lines_code_no_data                   -0.1477807
## mean_lines_code_and_comment               -0.1444444
## mean_lines_code                           -0.1435089
## max_lines_code_and_comment                -0.1395865
## max_lines_code_and_comment_no_data        -0.1386585
## mean_file_size_no_data                    -0.1379620
## max_lines_code_no_data                    -0.1375359
## PC 13
##                                                       coef
## pct_bytes_no_data_Java                           0.1288739
## total_lines_of_code_Bourne_Shell                 0.1174483
## max_lines_code_Bourne_Shell                      0.1169153
## max_lines_code_and_comment_Bourne_Shell          0.1167828
## bytes_Bourne_Again_Shell                        -0.1155027
## total_lines_code_and_comment_Bourne_Shell        0.1154875
## mean_lines_code_and_comment_Bourne_Shell         0.1152790
## total_lines_code_and_comment_Bourne_Again_Shell -0.1150371
## mean_bytes_Bourne_Shell                          0.1146503
## total_lines_comment_Bourne_Again_Shell          -0.1145521
## mean_lines_code_Bourne_Shell                     0.1145095
## total_lines_of_code_Bourne_Again_Shell          -0.1139706
## bytes_Bourne_Shell                               0.1122968
## pct_lines_comment_Java                           0.1099680
## mean_bytes_per_line_code_and_comment_Java        0.1096952
## pct_files_no_data_type_system_static             0.1078512
## num_files_m4                                     0.1076396
## total_lines_comment_m4                           0.1063677
## commit_authors                                  -0.1047277
## max_lines_code_and_comment_Bourne_Again_Shell   -0.1024884
## PC 14
##                                                coef
## mean_lines_code_and_comment_no_data      0.13137172
## mean_lines_code_and_comment              0.12881710
## mean_lines_code_no_data                  0.12589387
## mean_lines_code                          0.12283369
## max_lines_code_and_comment_Ruby          0.12185156
## max_lines_code_Ruby                      0.12185138
## total_bytes_no_data_compatibility_duck  -0.11029549
## num_files_JavaScript                    -0.10943755
## max_lines_code_Perl                      0.10847442
## total_lines_code_and_comment_JavaScript -0.10769655
## mean_file_size_no_data                   0.10749119
## total_lines_of_code_JavaScript          -0.10746903
## max_lines_code_and_comment_Perl          0.10741917
## pct_bytes_test_cases_object_oriented     0.10523690
## pct_bytes_test_cases_interpreted         0.10147052
## mean_bytes_Perl                          0.10136976
## mean_lines_code_and_comment_Perl         0.10071275
## pct_bytes_test_cases_functional_impure   0.09895874
## mean_lines_code_Perl                     0.09884155
## pct_bytes_test_cases_compatibility_duck  0.09787587
## PC 15
##                                                         coef
## bytes_C_Cpp_Header                                -0.1265870
## pct_bytes_no_data_type_system_unsafe               0.1264224
## mean_lines_code_C                                  0.1263978
## total_lines_code_and_comment_Python                0.1262095
## total_lines_of_code_Python                         0.1254312
## max_lines_code_and_comment_C                       0.1251040
## total_lines_comment_Python                         0.1249964
## bytes_Python                                       0.1234337
## num_files_Python                                   0.1211313
## mean_lines_code_and_comment_C                      0.1209729
## total_lines_of_code_C_Cpp_Header                  -0.1201117
## pct_bytes_no_data_compiled                         0.1197629
## pct_bytes_no_data_C                                0.1197047
## mean_bytes_C                                       0.1171369
## total_lines_code_and_comment_C_Cpp_Header         -0.1169604
## num_files_C_Cpp_Header                            -0.1159057
## mean_bytes_per_line_code_and_comment_C_Cpp_Header  0.1150983
## mean_bytes_per_line_code_and_comment_C             0.1141447
## max_lines_code_C                                   0.1102222
## pct_bytes_no_data_Cpp                              0.1076800
## PC 16
##                                                 coef
## bytes_C_Cpp_Header                         0.1413859
## commit_authors_male                       -0.1285657
## total_lines_of_code_C_Cpp_Header           0.1279808
## num_files_Cpp                             -0.1266416
## bytes_PHP                                 -0.1239111
## commit_authors                            -0.1224591
## total_lines_code_and_comment_Cpp          -0.1222927
## total_lines_of_code_Cpp                   -0.1218565
## total_lines_comment_Cpp                   -0.1218335
## num_files_C_Cpp_Header                     0.1211502
## total_lines_code_and_comment_PHP          -0.1187777
## total_lines_of_code_PHP                   -0.1187686
## mean_new_files_per_day_with_new_files      0.1180444
## total_lines_comment_PHP                   -0.1177750
## total_lines_code_and_comment_C_Cpp_Header  0.1152143
## max_lines_code_PHP                        -0.1125345
## num_files_PHP                             -0.1115222
## total_files_type_system_unsafe            -0.1100359
## max_lines_code_and_comment_PHP            -0.1097612
## bytes_Cpp                                 -0.1089323
## PC 17
##                                                 coef
## pct_files_no_data_logic                   -0.1731891
## total_files_logic                         -0.1712570
## pct_bytes_no_data_logic                   -0.1699602
## total_bytes_no_data_logic                 -0.1669657
## pct_bytes_no_data_declarative             -0.1644858
## bytes_C_Cpp_Header                         0.1638024
## total_lines_of_code_C_Cpp_Header           0.1599349
## num_files_C_Cpp_Header                     0.1567673
## total_bytes_test_cases_logic              -0.1556881
## pct_files_no_data_declarative             -0.1549792
## total_lines_code_and_comment_C_Cpp_Header  0.1513034
## forks_count                                0.1478956
## subscribers_count                          0.1446753
## watchers_count                             0.1433364
## stargazers_count                           0.1433364
## commit_authors_male                        0.1327361
## commit_authors                             0.1295462
## total_lines_comment_C_Cpp_Header           0.1100994
## bytes_PHP                                  0.1051093
## commit_authors_no_gender                   0.1036532
## PC 18
##                                                    coef
## total_bytes_no_data_logic                   -0.27117980
## pct_bytes_no_data_logic                     -0.26673049
## total_files_logic                           -0.26444794
## pct_files_no_data_logic                     -0.25998989
## total_bytes_test_cases_logic                -0.24490609
## pct_bytes_no_data_declarative               -0.24133523
## pct_files_no_data_declarative               -0.22580749
## pct_bytes_no_data_object_oriented           -0.10873023
## consecutive_months_no_new_files_added       -0.10681401
## pct_bytes_no_data_functional_impure         -0.10542656
## consecutive_months_no_commits               -0.10198624
## commit_span_days                            -0.09490328
## pct_files_no_data_object_oriented           -0.09152597
## pct_months_new_files_added                   0.08613092
## mean_bytes_per_line_code_and_comment_MATLAB  0.08519153
## pct_files_no_data_functional_impure         -0.08399541
## mean_lines_code_MATLAB                       0.08387689
## mean_lines_code_and_comment_MATLAB           0.08329897
## pct_bytes_test_cases_declarative            -0.08227450
## mean_bytes_MATLAB                            0.08173162
## PC 19
##                                               coef
## total_lines_of_code_Python              -0.2091659
## total_lines_code_and_comment_Python     -0.2089519
## bytes_Python                            -0.2079769
## total_lines_comment_Python              -0.2019665
## num_files_Python                        -0.1870697
## pct_files_no_data_logic                 -0.1372176
## pct_bytes_no_data_logic                 -0.1353027
## total_bytes_no_data_logic               -0.1288058
## num_files_JavaScript                     0.1222123
## max_lines_code_Python                   -0.1210721
## max_lines_code_and_comment_Python       -0.1209686
## total_lines_comment_JavaScript           0.1153241
## total_lines_code_and_comment_JavaScript  0.1151029
## total_files_logic                       -0.1146095
## watchers_count                           0.1137620
## stargazers_count                         0.1137620
## total_lines_of_code_JavaScript           0.1118237
## total_bytes_test_cases_logic            -0.1111289
## pct_bytes_no_data_JavaScript             0.1106893
## pct_bytes_no_data_procedural             0.1070838
## PC 20
##                                                coef
## mean_lines_code_C                         0.1753989
## pct_bytes_no_data_C                       0.1748107
## mean_lines_code_and_comment_C             0.1736898
## mean_bytes_C                              0.1729926
## max_lines_code_and_comment_C              0.1713241
## max_lines_code_C                          0.1647866
## pct_files_no_data_logic                   0.1533043
## pct_bytes_no_data_logic                   0.1517994
## total_bytes_no_data_logic                 0.1464254
## total_files_logic                         0.1438111
## mean_bytes_Cpp                           -0.1402967
## mean_lines_code_and_comment_Cpp          -0.1392374
## watchers_count                            0.1370817
## stargazers_count                          0.1370817
## mean_lines_code_Cpp                      -0.1367879
## total_bytes_test_cases_logic              0.1347983
## pct_bytes_no_data_Cpp                    -0.1342564
## mean_bytes_per_line_code_and_comment_Cpp -0.1309554
## forks_count                               0.1293262
## subscribers_count                         0.1171088
## PC 21
##                                                       coef
## mean_bytes_JavaScript                           -0.1304911
## pct_files_no_data_procedural                     0.1267511
## pct_lines_comment_JavaScript                    -0.1265050
## mean_lines_code_and_comment_no_data              0.1195147
## mean_lines_code_no_data                          0.1182196
## max_lines_code_and_comment_Cpp                   0.1175830
## pct_bytes_no_data_imperative                     0.1163651
## total_files_compatibility_structural            -0.1162719
## pct_bytes_no_data_procedural                     0.1158719
## mean_lines_code_Perl                             0.1158127
## max_lines_code_Cpp                               0.1156713
## mean_lines_code_and_comment_Perl                 0.1150973
## mean_bytes_Perl                                  0.1136723
## mean_lines_code_and_comment                      0.1131873
## mean_lines_code                                  0.1115289
## pct_bytes_no_data_JavaScript                    -0.1104373
## mean_lines_code_and_comment_Cpp                  0.1088651
## mean_lines_code_and_comment_JavaScript          -0.1087886
## total_bytes_test_cases_compatibility_structural -0.1087170
## num_files_JavaScript                             0.1079117
## PC 22
##                                              coef
## mean_bytes_JavaScript                   0.1623488
## total_lines_comment_Perl               -0.1616739
## bytes_Perl                             -0.1514930
## mean_lines_code_and_comment_JavaScript  0.1471058
## total_lines_code_and_comment_Perl      -0.1452632
## mean_lines_code_JavaScript              0.1414519
## total_lines_of_code_Perl               -0.1384731
## total_lines_code_and_comment_Python     0.1368762
## total_lines_of_code_Python              0.1364775
## num_files_Python                        0.1361560
## total_lines_comment_Python              0.1340855
## bytes_Python                            0.1340143
## num_files_Perl                         -0.1292127
## mean_lines_code                        -0.1200868
## mean_lines_code_and_comment            -0.1172208
## mean_bytes_MATLAB                       0.1106038
## mean_lines_code_MATLAB                  0.1104077
## mean_lines_code_and_comment_MATLAB      0.1096151
## total_files_compatibility_structural    0.1083588
## mean_lines_code_Python                 -0.1073290
## PC 23
##                                                   coef
## mean_lines_code_and_comment_MATLAB           0.1694504
## mean_lines_code_MATLAB                       0.1664930
## mean_bytes_MATLAB                            0.1658783
## mean_bytes_per_line_code_and_comment_MATLAB  0.1651162
## total_bytes_test_cases_compatibility_duck   -0.1551043
## total_files_functional_pure                 -0.1498045
## total_bytes_no_data_functional_pure         -0.1478842
## num_files_Perl                               0.1441969
## pct_lines_comment_MATLAB                     0.1420074
## mean_lines_code_Perl                        -0.1389710
## mean_lines_code_and_comment_Perl            -0.1368041
## pct_bytes_no_data_MATLAB                     0.1356177
## bytes_Perl                                   0.1355503
## mean_bytes_Perl                             -0.1354813
## total_lines_of_code_Perl                     0.1348833
## total_lines_code_and_comment_Perl            0.1344322
## total_lines_comment_Perl                     0.1292255
## pct_files_no_data_functional_pure           -0.1290168
## total_bytes_test_cases_functional_pure      -0.1272304
## total_bytes_test_cases_interpreted          -0.1243680
## PC 24
##                                                 coef
## forks_count                               -0.2024822
## watchers_count                            -0.2011704
## stargazers_count                          -0.2011704
## total_lines_comment_Perl                  -0.1524145
## bytes_Perl                                -0.1505578
## subscribers_count                         -0.1483543
## total_lines_code_and_comment_Perl         -0.1456720
## total_lines_of_code_Perl                  -0.1420709
## num_files_Perl                            -0.1347029
## mean_bytes_per_line_code_and_comment_Perl  0.1285004
## total_files_functional_pure               -0.1269036
## total_bytes_no_data_functional_pure       -0.1262661
## num_files_Java                             0.1234245
## mean_bytes_Java                           -0.1210828
## mean_lines_code_Java                      -0.1195390
## bytes_Java                                 0.1190247
## total_lines_of_code_Java                   0.1183912
## mean_lines_code_and_comment_C              0.1176253
## mean_bytes_C                               0.1175120
## total_lines_code_and_comment_Java          0.1174786
## PC 25
##                                              coef
## mean_bytes_SQL                          0.2194527
## mean_lines_code_SQL                     0.2081194
## mean_lines_code_and_comment_SQL         0.2051570
## pct_bytes_no_data_SQL                   0.1423101
## pct_lines_comment_SQL                   0.1379925
## pct_bytes_no_data_JavaScript           -0.1362489
## mean_bytes_JavaScript                  -0.1353051
## total_lines_code_and_comment_R          0.1349216
## consecutive_months_no_new_files_added  -0.1334242
## total_lines_of_code_R                   0.1320191
## mean_lines_code_and_comment_JavaScript -0.1315816
## mean_lines_code_JavaScript             -0.1302130
## bytes_R                                 0.1245428
## total_lines_comment_R                   0.1219369
## max_lines_code_and_comment_R            0.1167297
## max_lines_code_R                        0.1141266
## consecutive_months_no_commits          -0.1111053
## pct_months_with_commits                 0.1071757
## pct_months_new_files_added              0.1063677
## pct_bytes_test_cases_declarative        0.1040184
## PC 26
##                                                        coef
## paper_authors                                    0.40678270
## paper_authors_male                               0.39700827
## paper_authors_female                             0.39212517
## paper_authors_no_gender                          0.38490639
## consecutive_months_no_commits                   -0.11518616
## consecutive_months_no_new_files_added           -0.11485460
## total_files_compatibility_structural             0.11454466
## total_bytes_no_data_compatibility_structural     0.10934267
## total_bytes_test_cases_compatibility_structural  0.10841887
## pct_bytes_no_data_compatibility_structural       0.10402379
## pct_files_no_data_compatibility_structural       0.09888705
## commit_span_days                                -0.09499199
## pct_months_with_commits                          0.09415847
## pct_months_new_files_added                       0.09160881
## mean_lines_code_and_comment_Ruby                -0.08695453
## mean_lines_code_Ruby                            -0.08556228
## mean_bytes_Ruby                                 -0.08438590
## mean_lines_code_Cpp                              0.07873498
## mean_lines_code_and_comment_Cpp                  0.07753349
## mean_bytes_Cpp                                   0.07226755
## PC 27
##                                                       coef
## paper_authors_female                            -0.2150938
## paper_authors                                   -0.2135144
## paper_authors_male                              -0.2112017
## paper_authors_no_gender                         -0.1786439
## total_files_compatibility_structural             0.1767714
## total_bytes_no_data_compatibility_structural     0.1753555
## total_bytes_test_cases_compatibility_structural  0.1653538
## pct_bytes_no_data_compatibility_structural       0.1560090
## pct_files_no_data_compatibility_structural       0.1441295
## mean_lines_code_and_comment_Ruby                -0.1392944
## mean_lines_code_Ruby                            -0.1381617
## mean_bytes_Ruby                                 -0.1371632
## mean_lines_code_C                               -0.1238222
## mean_lines_code_and_comment_C                   -0.1213037
## mean_bytes_C                                    -0.1203016
## mean_lines_code_and_comment_SQL                 -0.1195893
## mean_bytes_per_line_code_and_comment_Python      0.1162541
## mean_lines_code_SQL                             -0.1100360
## mean_bytes_SQL                                  -0.1099730
## pct_bytes_no_data_SQL                           -0.1077019
## PC 28
##                                                        coef
## total_files_functional_pure                      0.37126972
## total_bytes_no_data_functional_pure              0.36867341
## pct_files_no_data_functional_pure                0.32770144
## total_bytes_test_cases_functional_pure           0.29437131
## pct_bytes_no_data_functional_pure                0.25945570
## total_files_compatibility_structural            -0.16767811
## total_bytes_test_cases_compatibility_structural -0.16187075
## total_bytes_no_data_compatibility_structural    -0.15292638
## pct_bytes_no_data_compatibility_structural      -0.14724642
## pct_files_no_data_compatibility_structural      -0.13789560
## mean_lines_code_and_comment_Ruby                -0.10692404
## mean_lines_code_Ruby                            -0.10674689
## mean_bytes_Ruby                                 -0.10369559
## total_bytes_test_cases_compatibility_duck       -0.09339405
## mean_bytes_C                                     0.08865648
## mean_lines_code_and_comment_C                    0.08778915
## mean_lines_code_C                                0.08536127
## num_files_C                                     -0.07824557
## consecutive_months_no_commits                    0.07733337
## total_bytes_test_cases_type_system_dynamic      -0.07671748
## PC 29
##                                                               coef
## total_files_compatibility_structural                    -0.2291810
## total_bytes_no_data_compatibility_structural            -0.2135204
## total_files_functional_pure                             -0.2124434
## total_bytes_test_cases_compatibility_structural         -0.2108957
## total_bytes_no_data_functional_pure                     -0.2071883
## pct_bytes_no_data_compatibility_structural              -0.1982646
## pct_files_no_data_compatibility_structural              -0.1914380
## pct_files_no_data_functional_pure                       -0.1878174
## total_bytes_test_cases_functional_pure                  -0.1584035
## pct_bytes_no_data_functional_pure                       -0.1508498
## pct_bytes_test_cases_type_system_dynamic                 0.1418416
## mean_bytes_SQL                                          -0.1344065
## pct_bytes_test_cases_interpreted                         0.1331107
## mean_lines_code_SQL                                     -0.1265889
## mean_lines_code_and_comment_SQL                         -0.1226090
## num_files_C                                             -0.1134522
## pct_bytes_no_data_Bourne_Again_Shell                     0.1025358
## total_size_test_cases_no_data                           -0.1023627
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell  0.1016824
## pct_bytes_test_cases_compatibility_duck                  0.1001677
## PC 30
##                                                  coef
## mean_lines_code_and_comment_Ruby           0.31116030
## mean_lines_code_Ruby                       0.31070842
## mean_bytes_Ruby                            0.30655430
## mean_bytes_SQL                            -0.18674609
## mean_lines_code_and_comment_SQL           -0.17597570
## pct_bytes_no_data_SQL                     -0.17490493
## mean_bytes_per_line_code_and_comment_Ruby  0.16388850
## mean_lines_code_SQL                       -0.16339950
## pct_lines_comment_SQL                     -0.14858778
## mean_bytes_per_line_code_and_comment_SQL  -0.11789275
## total_bytes_test_cases_declarative        -0.11298883
## pct_bytes_no_data_Ruby                     0.11051518
## paper_authors                              0.10821138
## num_files_Ruby                            -0.10659311
## paper_authors_male                         0.10636621
## paper_authors_female                       0.10157874
## paper_authors_no_gender                    0.10116686
## pct_bytes_no_data_declarative             -0.09425818
## total_lines_of_code_Ruby                  -0.09217090
## total_lines_code_and_comment_Ruby         -0.09170198
## PC 31
##                                                    coef
## total_lines_code_and_comment_R               -0.2071848
## total_lines_of_code_R                        -0.2049381
## bytes_R                                      -0.1949590
## total_lines_comment_R                        -0.1829765
## total_bytes_test_cases_functional_impure     -0.1281493
## total_lines_test_cases_no_data               -0.1273825
## total_bytes_test_cases_procedural            -0.1234750
## mean_bytes_C                                 -0.1201605
## pct_months_new_files_added                   -0.1198662
## mean_lines_code_and_comment_C                -0.1192760
## consecutive_months_no_new_files_added         0.1181847
## mean_lines_code_C                            -0.1170970
## total_size_test_cases_no_data                -0.1162634
## total_bytes_no_data_compatibility_structural  0.1153091
## mean_bytes_per_line_code_and_comment_R        0.1149672
## pct_months_with_commits                      -0.1130381
## max_lines_code_R                             -0.1129662
## total_bytes_test_cases_object_oriented       -0.1120389
## pct_bytes_no_data_R                           0.1113223
## max_lines_code_and_comment_R                 -0.1097307
## PC 32
##                                              coef
## pct_months_new_files_added            -0.29820616
## consecutive_months_no_commits          0.29455420
## pct_months_with_commits               -0.29369018
## consecutive_months_no_new_files_added  0.27732772
## mean_lines_code_Cpp                   -0.18454415
## mean_lines_code_and_comment_Cpp       -0.18248042
## mean_bytes_Cpp                        -0.18205409
## commit_span_days                       0.17487845
## mean_bytes_Ruby                       -0.12902264
## mean_lines_code_Ruby                  -0.12649006
## mean_lines_code_and_comment_Ruby      -0.12469452
## mean_commits_per_month                -0.10547773
## total_lines_code_and_comment_R         0.09845219
## total_lines_of_code_R                  0.09418940
## total_lines_comment_R                  0.09312011
## paper_authors_male                     0.09188496
## bytes_R                                0.09085796
## max_lines_code_R                       0.08890235
## paper_authors                          0.08872460
## max_lines_code_Cpp                    -0.08780826
## PC 33
##                                                               coef
## mean_lines_code_Cpp                                      0.2423072
## mean_lines_code_and_comment_Cpp                          0.2393224
## mean_bytes_Cpp                                           0.2359273
## pct_months_new_files_added                              -0.1869989
## pct_months_with_commits                                 -0.1788101
## consecutive_months_no_commits                            0.1757697
## consecutive_months_no_new_files_added                    0.1741110
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell  0.1296733
## pct_lines_comment_Bourne_Again_Shell                     0.1210644
## mean_commits_per_month                                  -0.1202588
## total_bytes_test_cases_type_system_safe                  0.1159793
## pct_bytes_test_cases_compatibility_duck                  0.1148259
## total_bytes_test_cases_compatibility_duck                0.1148251
## max_lines_code_Cpp                                       0.1109134
## max_lines_code_and_comment_C_Cpp_Header                 -0.1080308
## mean_lines_code_C_Cpp_Header                            -0.1076116
## mean_lines_code_and_comment_C_Cpp_Header                -0.1054239
## mean_bytes_C_Cpp_Header                                 -0.1031514
## total_bytes_test_cases_interpreted                       0.1031040
## total_bytes_test_cases_type_system_dynamic               0.1028638
## PC 34
##                                                coef
## commits                                   0.1760309
## commits_no_gender                         0.1602065
## mean_lines_code_and_comment_Java         -0.1564970
## mean_lines_code_Java                     -0.1534757
## mean_bytes_Java                          -0.1527743
## mean_lines_code_C_Cpp_Header             -0.1465496
## mean_bytes_C_Cpp_Header                  -0.1449982
## commit_span_days                          0.1443712
## mean_commits_per_month                    0.1418592
## mean_lines_code_and_comment_C_Cpp_Header -0.1416119
## max_lines_code_and_comment_C_Cpp_Header  -0.1370157
## commits_male                              0.1352525
## commits_female                            0.1338220
## forks_count                              -0.1314032
## pct_bytes_test_cases_type_system_static  -0.1308207
## watchers_count                           -0.1263181
## stargazers_count                         -0.1263181
## max_lines_code_C_Cpp_Header              -0.1262598
## mean_day_new_files_added                  0.1246620
## mean_lines_code_and_comment_C            -0.1211235
## PC 35
##                                                     coef
## mean_lines_code_Java                          -0.2493039
## mean_lines_code_and_comment_Java              -0.2478298
## mean_bytes_Java                               -0.2383029
## total_bytes_test_cases_compatibility_duck     -0.1594534
## mean_bytes_Ruby                                0.1441586
## pct_months_with_commits                       -0.1427153
## mean_lines_code_Ruby                           0.1387760
## mean_lines_code_and_comment_Ruby               0.1374266
## pct_bytes_test_cases_compatibility_nominative  0.1352331
## pct_months_new_files_added                    -0.1286739
## mean_bytes_PHP                                -0.1184583
## pct_bytes_test_cases_type_system_static        0.1184508
## mean_lines_code_PHP                           -0.1184259
## consecutive_months_no_commits                  0.1162435
## mean_lines_code_and_comment_PHP               -0.1118598
## pct_bytes_test_cases_type_system_unsafe        0.1110502
## mean_bytes_Python                             -0.1090660
## max_lines_code_Java                           -0.1048985
## total_bytes_test_cases_type_system_dynamic    -0.1047449
## mean_commits_per_month                        -0.1030173
## PC 36
##                                                               coef
## mean_bytes_C                                            -0.1970224
## mean_lines_code_and_comment_C                           -0.1946505
## mean_lines_code_C_Cpp_Header                             0.1928191
## mean_lines_code_and_comment_C_Cpp_Header                 0.1902177
## num_citations_pmc                                        0.1891453
## mean_bytes_C_Cpp_Header                                  0.1858760
## mean_lines_code_C                                       -0.1855156
## num_citations_per_week_pmc_minus_2_years                 0.1466377
## pct_bytes_test_cases_type_system_static                 -0.1417717
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell -0.1371336
## mean_lines_code_and_comment_PHP                         -0.1343874
## mean_lines_code_PHP                                     -0.1334128
## mean_bytes_PHP                                          -0.1295642
## pct_bytes_no_data_compatibility_structural              -0.1293539
## num_files_Bourne_Again_Shell                            -0.1194878
## pct_files_no_data_compatibility_structural              -0.1183791
## pct_bytes_no_data_C_Cpp_Header                           0.1150331
## pct_bytes_test_cases_compatibility_nominative           -0.1137080
## pct_bytes_test_cases_type_system_dynamic                 0.1136779
## pct_bytes_test_cases_interpreted                         0.1083133
## PC 37
##                                                  coef
## mean_bytes_PHP                            -0.33411412
## mean_lines_code_PHP                       -0.33021284
## mean_lines_code_and_comment_PHP           -0.31523397
## mean_lines_code_Java                       0.18100889
## mean_lines_code_and_comment_Java           0.17832566
## mean_bytes_Java                            0.17426452
## total_lines_comment_PHP                    0.13235263
## num_files_PHP                              0.12639347
## total_bytes_test_cases_compatibility_duck -0.12463262
## total_lines_code_and_comment_PHP           0.11989389
## total_lines_of_code_PHP                    0.11362213
## mean_bytes_per_line_code_and_comment_PHP  -0.10757471
## mean_bytes_C_Cpp_Header                   -0.10158674
## pct_lines_comment_PHP                     -0.10075777
## mean_lines_code_C_Cpp_Header              -0.09594984
## mean_lines_code_Python                     0.09513806
## commits_no_gender                          0.09454034
## mean_lines_code_and_comment_Python         0.09382838
## commits                                    0.09204324
## pct_bytes_test_cases_compatibility_duck   -0.09097073
## PC 38
##                                                               coef
## mean_lines_code_PHP                                      0.2112546
## mean_bytes_PHP                                           0.2099476
## mean_lines_code_and_comment_PHP                          0.2076452
## mean_lines_code_Cpp                                      0.1582911
## mean_lines_code_and_comment_Cpp                          0.1553876
## mean_bytes_Cpp                                           0.1542720
## num_files_Bourne_Again_Shell                            -0.1521207
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell -0.1399263
## mean_lines_code_and_comment_JavaScript                  -0.1316971
## mean_lines_code_JavaScript                              -0.1304612
## total_bytes_test_cases_type_system_dynamic              -0.1278751
## total_bytes_test_cases_interpreted                      -0.1260391
## mean_bytes_JavaScript                                   -0.1223414
## pct_bytes_no_data_Bourne_Again_Shell                    -0.1181020
## total_lines_of_code_C                                    0.1171636
## total_lines_code_and_comment_C                           0.1154156
## num_citations_per_week_pmc_minus_2_years                 0.1151903
## bytes_C                                                  0.1136074
## pct_lines_comment_no_data                               -0.1110109
## pct_lines_comment                                       -0.1084579
## PC 39
##                                                               coef
## mean_lines_code_C_Cpp_Header                            -0.2103984
## mean_bytes_C_Cpp_Header                                 -0.2099714
## mean_lines_code_and_comment_C_Cpp_Header                -0.1945452
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell -0.1705914
## max_lines_code_C_Cpp_Header                             -0.1556282
## max_lines_code_and_comment_C_Cpp_Header                 -0.1469572
## pct_bytes_no_data_Bourne_Again_Shell                    -0.1461424
## pct_bytes_no_data_Cpp                                    0.1425066
## mean_bytes_Python                                        0.1395930
## mean_lines_code_Python                                   0.1351665
## mean_lines_code_and_comment_Python                       0.1350222
## pct_lines_comment_Bourne_Again_Shell                    -0.1344011
## mean_bytes_R                                             0.1208725
## pct_lines_comment_C_Cpp_Header                           0.1198925
## pct_bytes_no_data_type_system_unsafe                     0.1146158
## mean_lines_code_R                                        0.1138313
## num_files_Bourne_Again_Shell                            -0.1106436
## mean_lines_code_and_comment_R                            0.1048123
## mean_bytes_C                                            -0.1044362
## mean_lines_code_C                                       -0.1023463
## PC 40
##                                                      coef
## mean_lines_code_R                              0.34099347
## mean_bytes_R                                   0.31644484
## mean_lines_code_and_comment_R                  0.30744375
## num_files_R                                   -0.23243282
## pct_lines_comment_no_data                     -0.21141378
## pct_lines_comment                             -0.20797266
## max_lines_code_R                               0.16140220
## pct_bytes_test_cases_compatibility_structural  0.15877192
## total_lines_comment_R                         -0.14386823
## max_lines_code_and_comment_R                   0.14217885
## pct_bytes_test_cases_logic                     0.13051961
## pct_bytes_test_cases_declarative               0.12579660
## pct_lines_comment_R                           -0.12427783
## total_lines_code_and_comment_R                -0.11050272
## bytes_R                                       -0.10942105
## mean_lines_code_C_Cpp_Header                   0.10803861
## pct_bytes_test_cases_imperative                0.10791887
## mean_bytes_C_Cpp_Header                        0.10315820
## num_files_Bourne_Shell                        -0.10187908
## mean_lines_code_and_comment_C_Cpp_Header       0.09681007
## PC 41
##                                                    coef
## pct_bytes_no_data_compatibility_structural    0.2603736
## pct_files_no_data_compatibility_structural    0.2459516
## pct_bytes_no_data_m4                          0.1908717
## num_files_C                                  -0.1872596
## mean_lines_code_Java                          0.1666254
## total_lines_of_code_C                        -0.1612316
## mean_lines_code_and_comment_Java              0.1581595
## mean_bytes_C                                  0.1547738
## mean_lines_code_and_comment_C                 0.1532020
## mean_bytes_Java                               0.1487645
## mean_lines_code_C                             0.1479225
## num_files_Bourne_Shell                       -0.1449386
## total_lines_code_and_comment_C               -0.1405180
## bytes_C                                      -0.1309369
## mean_commit_message_len                      -0.1282991
## median_commit_message_len                    -0.1214323
## total_bytes_no_data_compatibility_structural  0.1189608
## pct_lines_comment_no_data                    -0.1151939
## num_files_Bourne_Again_Shell                 -0.1118813
## mean_lines_code_m4                            0.1096756
## PC 42
##                                                                coef
## pct_bytes_test_cases_logic                               0.44635461
## pct_bytes_test_cases_declarative                         0.40151918
## pct_bytes_test_cases_functional_pure                     0.29542945
## num_citations_pmc                                        0.25510700
## num_citations_per_week_pmc_minus_2_years                 0.23014305
## total_bytes_test_cases_declarative                       0.16566858
## num_non_committing_authors                               0.14222673
## pct_bytes_test_cases_functional_impure                  -0.10712140
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell  0.10124580
## pct_bytes_no_data_Bourne_Again_Shell                     0.09932908
## num_files_Bourne_Again_Shell                             0.09865289
## mean_commit_message_len                                  0.09491972
## pct_bytes_test_cases_compatibility_duck                 -0.09410354
## total_bytes_test_cases_logic                             0.09200106
## pct_lines_comment_SQL                                   -0.08895700
## pct_bytes_no_data_logic                                 -0.08815512
## pct_files_no_data_logic                                 -0.08577923
## watchers_count                                          -0.08384819
## stargazers_count                                        -0.08384819
## pct_bytes_test_cases_type_system_dynamic                -0.07963399
## PC 43
##                                                               coef
## shannon_commit_author_gender                             0.2931185
## shannon_commits_gender                                   0.2788318
## mean_bytes_per_line_code_and_comment_Bourne_Shell        0.1991365
## mean_bytes_per_line_code_and_comment_Bourne_Again_Shell -0.1712476
## num_files_Bourne_Again_Shell                            -0.1644507
## mean_bytes_Bourne_Shell                                  0.1642314
## mean_bytes_per_line_code_and_comment_no_data             0.1598235
## pct_lines_comment_Bourne_Again_Shell                    -0.1570568
## pct_bytes_test_cases_logic                               0.1515720
## pct_bytes_no_data_Bourne_Shell                           0.1493897
## commits_female                                           0.1384271
## commit_authors_female                                    0.1381689
## mean_file_size                                           0.1373466
## pct_lines_comment_Cpp                                    0.1308229
## pct_bytes_no_data_Bourne_Again_Shell                    -0.1200563
## pct_bytes_test_cases_functional_pure                     0.1195572
## mean_lines_code_Bourne_Shell                             0.1188103
## mean_lines_code_and_comment_Bourne_Shell                 0.1173803
## pct_bytes_test_cases_declarative                         0.1157019
## mean_bytes_per_line_code_and_comment                     0.1131613

Look at score association with repo groupings

# Function to find PCs associated with a column/vector of repo measurements
#
# Args:
#   repo_obs:    logical or numeric vector with one entry per row of `scores`.
#                Logical input is tested with a Wilcoxon rank-sum test of the
#                scores between the TRUE and FALSE groups; numeric input with
#                a linear regression of the scores on `repo_obs`.
#   pval_cutoff: only PCs with a p-value strictly below this are reported
#   scores:      matrix of PC scores, one column per PC (defaults to the
#                global `pca_res$scores`)
#   ncomp:       number of leading PCs to test (defaults to the global
#                `ncomp_keep`)
#
# Returns a data frame sorted by increasing p-value with columns
#   PC, pval, median_true, median_false   (logical input), or
#   PC, pval, estimate                    (numeric input; regression slope).
# The data frame has zero rows if no PC passes the cutoff.
# Stops with "Data type not supported" for any other input type.
associated_pcs <- function(repo_obs, pval_cutoff = 0.0001,
                           scores = pca_res$scores, ncomp = ncomp_keep) {
  # Fail fast on unsupported input instead of part-way through the tests
  if (!is.logical(repo_obs) && !is.numeric(repo_obs)) {
    stop("Data type not supported")
  }
  binary <- is.logical(repo_obs)

  # Empty template so the column layout is fixed even with no significant PCs
  template <- if (binary) {
    data.frame(PC = integer(), pval = numeric(), median_true = numeric(),
               median_false = numeric())
  } else {
    data.frame(PC = integer(), pval = numeric(), estimate = numeric())
  }

  # Test a single PC; returns a one-row data frame, or NULL if the PC does
  # not pass the cutoff or the test cannot produce a p-value
  test_pc <- function(j) {
    pc_df <- data.frame(repo_obs = repo_obs, scores = scores[, j])
    if (binary) {
      # Binary repo feature
      p <- wilcox.test(scores ~ repo_obs, data = pc_df)$p.value
      if (is.na(p) || p >= pval_cutoff) {
        return(NULL)
      }
      # which() ignores NA entries of repo_obs when picking each group
      data.frame(PC = j, pval = p,
                 median_true = median(pc_df[which(pc_df$repo_obs), "scores"]),
                 median_false = median(pc_df[which(!pc_df$repo_obs), "scores"]))
    } else {
      # Numeric repo feature
      coefs <- summary(lm(scores ~ repo_obs, data = pc_df))$coefficients
      # The slope row is absent from the table when it cannot be estimated
      # (e.g. repo_obs is constant)
      if (!("repo_obs" %in% rownames(coefs))) {
        return(NULL)
      }
      p <- coefs["repo_obs", 4]
      if (is.na(p) || p >= pval_cutoff) {
        return(NULL)
      }
      data.frame(PC = j, pval = p, estimate = coefs["repo_obs", 1])
    }
  }

  # seq_len() yields no iterations when ncomp is 0 (1:0 would give c(1, 0))
  hits <- Filter(Negate(is.null), lapply(seq_len(ncomp), test_pc))
  res <- do.call(rbind, c(list(template), hits))

  # Sort with base R: plyr and dplyr are both attached in this session and
  # each exports arrange(), so order() avoids depending on which one wins
  res <- res[order(res$pval), , drop = FALSE]
  rownames(res) <- NULL
  res
}

# Display some associations
# Logical input: each PC is tested with wilcox.test() and, for PCs passing the
# p-value cutoff, the median score of the TRUE and FALSE groups is reported
associated_pcs(repo_data_all$is_high_profile)
##    PC         pval median_true median_false
## 1   1 1.876491e-12   -7.015125  1.767586075
## 2  10 5.817301e-12   -3.733818  0.049158817
## 3  42 1.231610e-11    2.548866  0.028518650
## 4   6 4.466402e-10    3.488499 -0.411864116
## 5   7 3.935438e-09   -3.576013  0.188884357
## 6  36 3.449137e-08    3.861506  0.026719690
## 7  16 1.935552e-07   -2.953724 -0.102268413
## 8  23 3.194646e-07    1.620941 -0.008419759
## 9   3 4.824076e-07   -3.076207  0.271779893
## 10 28 5.093561e-07    1.061539 -0.033890993
## 11 13 5.630509e-06   -3.090234 -0.112774488
## 12 17 5.742936e-06    2.250737  0.162178664
## 13  9 2.206802e-05    1.657870  0.597585522
## 14 38 2.932533e-05    1.485005  0.014469707
# Numeric input: scores are regressed on the measurement with lm(); `estimate`
# is the fitted slope, so its magnitude depends on the measurement's scale
associated_pcs(repo_data_all$total_file_size)
##    PC          pval      estimate
## 1   1 2.813028e-234 -7.267761e-07
## 2   4  3.536057e-22  1.306006e-07
## 3  16  2.543945e-12  6.642074e-08
## 4   3  1.621018e-09 -8.756602e-08
## 5   6  1.763092e-08 -7.311027e-08
## 6  15  8.796145e-08 -5.259307e-08
## 7  17  3.764442e-07  4.664127e-08
## 8   2  1.504949e-05 -7.665350e-08
## 9   7  4.014657e-05  5.158842e-08
## 10 19  9.586471e-05 -3.428360e-08
# Numeric input (regression path) for the num_citations_pmc column
associated_pcs(repo_data_all$num_citations_pmc)
##    PC         pval      estimate
## 1  42 8.127347e-66  0.0009613828
## 2  36 2.505215e-39  0.0009506985
## 3   6 8.543879e-14  0.0011396090
## 4  33 3.668741e-13  0.0005642248
## 5   1 4.946870e-12 -0.0020956147
## 6  10 4.274431e-11 -0.0008689062
## 7  38 8.660794e-10  0.0004264397
## 8  20 3.288712e-09  0.0005980542
## 9  16 2.115919e-08 -0.0006315172
## 10 43 1.432503e-07 -0.0003197448
## 11  7 3.437949e-07 -0.0007511429
## 12 13 2.885783e-06 -0.0005741770
## 13  3 3.410662e-06 -0.0008033663
## 14 17 7.810375e-05  0.0004337214
# Numeric input (regression path) for the stargazers_count column
associated_pcs(repo_data_all$stargazers_count)
##    PC         pval      estimate
## 1  24 2.062986e-68 -0.0028663220
## 2  17 1.069134e-50  0.0030206950
## 3  20 3.943133e-39  0.0024483312
## 4  19 3.872706e-29  0.0021909354
## 5  14 4.399826e-24 -0.0023263012
## 6  10 2.088639e-23 -0.0024659401
## 7  13 9.140973e-23 -0.0022693015
## 8  34 3.254257e-19 -0.0012780708
## 9  21 1.330528e-16  0.0014918342
## 10 16 3.462218e-15 -0.0016842740
## 11  2 6.832548e-15 -0.0030984095
## 12  6 4.770214e-12  0.0020206905
## 13  7 2.723023e-10 -0.0017845240
## 14 35 4.081460e-09  0.0008273181
## 15  1 1.501269e-08 -0.0032794976
## 16 42 7.372890e-07 -0.0005974368
## 17 11 3.608204e-05  0.0010061961
# NOTE(review): the result below is identical to the stargazers_count result,
# so the two columns presumably hold the same values - confirm, and consider
# keeping only one of them
associated_pcs(repo_data_all$watchers_count)
##    PC         pval      estimate
## 1  24 2.062986e-68 -0.0028663220
## 2  17 1.069134e-50  0.0030206950
## 3  20 3.943133e-39  0.0024483312
## 4  19 3.872706e-29  0.0021909354
## 5  14 4.399826e-24 -0.0023263012
## 6  10 2.088639e-23 -0.0024659401
## 7  13 9.140973e-23 -0.0022693015
## 8  34 3.254257e-19 -0.0012780708
## 9  21 1.330528e-16  0.0014918342
## 10 16 3.462218e-15 -0.0016842740
## 11  2 6.832548e-15 -0.0030984095
## 12  6 4.770214e-12  0.0020206905
## 13  7 2.723023e-10 -0.0017845240
## 14 35 4.081460e-09  0.0008273181
## 15  1 1.501269e-08 -0.0032794976
## 16 42 7.372890e-07 -0.0005974368
## 17 11 3.608204e-05  0.0010061961
# Logical input built by comparison against "female"
# NOTE(review): the comparison yields NA wherever first_author_gender is NA;
# which() inside associated_pcs ignores those rows when taking the medians -
# confirm that excluding them is intended
associated_pcs(repo_data_all$first_author_gender == "female")
##   PC         pval median_true median_false
## 1 43 5.946032e-08   0.1217638   -0.1555944
## 2  1 3.155578e-05   2.1675745    1.5531409
# Same comparison for last_author_gender; no PC passes the p-value cutoff, so
# the result is an empty data frame
associated_pcs(repo_data_all$last_author_gender == "female")
## [1] PC           pval         median_true  median_false
## <0 rows> (or 0-length row.names)
# Numeric input (regression path); the strongest hit is PC 43, where
# shannon_commit_author_gender is also the top loading listed above
associated_pcs(repo_data_all$shannon_commit_author_gender)
##   PC         pval  estimate
## 1 43 3.156426e-50  4.560701
## 2 34 1.327683e-09  1.769080
## 3 41 2.155488e-09 -1.679837

Make biplots

# Make biplots
# One plot per retained PC, pairing PC cp with PC cp + 1 and colouring the
# observations by repo_data_all$is_high_profile.
# The pair needs PC cp + 1 to exist, so the loop stops one short of the last
# computed component if ncomp_keep reaches it; seq_len() also gives no
# iterations (rather than c(1, 0)) when there is nothing to plot.
for (cp in seq_len(min(ncomp_keep, ncol(pca_res$scores) - 1))) {
  plt <- ggbiplot(pca_res, obs.scale = 1, var.scale = 1,
                  choices = c(cp, cp + 1),
                  groups = repo_data_all$is_high_profile, ellipse = TRUE,
                  var.axes = FALSE, alpha = 0.2) +
    scale_color_discrete(name = '') +
    theme(legend.direction = 'horizontal', legend.position = 'top')
  # Explicit print() is required for ggplot objects created inside a loop
  print(plt)
}

Session info

sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] jsonlite_1.5     factoextra_1.0.5 ggbiplot_0.55    scales_0.5.0    
## [5] plyr_1.8.4       ggplot2_2.2.1    dplyr_0.7.4     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.14     ggpubr_0.1.6     knitr_1.18       bindr_0.1       
##  [5] magrittr_1.5     munsell_0.4.3    colorspace_1.3-2 R6_2.2.2        
##  [9] rlang_0.1.6      stringr_1.2.0    tools_3.4.3      gtable_0.2.0    
## [13] htmltools_0.3.6  lazyeval_0.2.1   yaml_2.1.16      assertthat_0.2.0
## [17] rprojroot_1.3-2  digest_0.6.13    tibble_1.4.1     bindrcpp_0.2    
## [21] ggrepel_0.7.0    glue_1.2.0       evaluate_0.10.1  rmarkdown_1.8   
## [25] labeling_0.3     stringi_1.1.6    compiler_3.4.3   pillar_1.0.1    
## [29] backports_1.1.2  pkgconfig_2.0.1